{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import lightgbm as lgb\n",
    "import xgboost as xgb\n",
    "from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n",
    "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer\n",
    "from sklearn.decomposition import TruncatedSVD,SparsePCA\n",
    "from sklearn.model_selection import KFold,StratifiedKFold\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score\n",
    "\n",
    "import gc\n",
    "import time\n",
    "import os\n",
    "import sys\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the two pickled app datasets found under pickle_path.\n",
    "# NOTE: unpickling can execute arbitrary code - only load pickle files you trust.\n",
    "pickle_path = \"../pickle\"\n",
    "active_file = \"{}/user_app_active.pickle\".format(pickle_path)\n",
    "usage_file = \"{}/user_app_seq.pickle\".format(pickle_path)\n",
    "active = pd.read_pickle(active_file)\n",
    "usage = pd.read_pickle(usage_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "vector_path = \"../vector\"\n",
    "# Import only the two classes this cell uses instead of a wildcard import.\n",
    "from glove import Corpus, Glove\n",
    "\n",
    "# Create the output directory before the long training run so the save\n",
    "# at the end of each iteration cannot fail on a missing folder.\n",
    "os.makedirs(vector_path, exist_ok=True)\n",
    "\n",
    "# Train one 300-dimensional GloVe model per dataset.\n",
    "# Output files are prefixed with the loop index: 0 = active, 1 = usage.\n",
    "for i,df in enumerate([active,usage]): # estimated runtime: 222 s + 2100 s\n",
    "    t1 = time.time()\n",
    "    corpus = Corpus()\n",
    "    corpus.fit(df['appid'].values)  # builds corpus.matrix and corpus.dictionary\n",
    "    glove = Glove(no_components=300, learning_rate=0.05)\n",
    "    glove.fit(corpus.matrix,epochs=12,no_threads=30,verbose=1)\n",
    "    # Attach the token dictionary so the saved model carries it along.\n",
    "    glove.add_dictionary(corpus.dictionary)\n",
    "    glove.save(\"{}/{}_glove300.model\".format(vector_path,i))\n",
    "    print(time.time()-t1)  # elapsed seconds for this dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1511.6713757514954\n"
     ]
    }
   ],
   "source": [
    "# Generate Word2Vec vectors for both datasets (0 = active, 1 = usage).\n",
    "from gensim import models\n",
    "\n",
    "# Estimated runtime: 1511 s + 5874 s.\n",
    "for idx, frame in enumerate((active, usage)):\n",
    "    started = time.time()\n",
    "    sequences = frame['appid'].values\n",
    "    # Setting sg would switch to the skip-gram method; in our tests the results were about the same.\n",
    "    w2v = models.Word2Vec(sequences, size=300, window=20, workers=40, hs=1)\n",
    "    out_file = \"{}/{}_w2v300.model\".format(vector_path, idx)\n",
    "    w2v.wv.save_word2vec_format(out_file)\n",
    "    print(time.time() - started)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t = []  # TF-IDF matrices: t[0] = active, t[1] = usage\n",
    "c = []  # raw count matrices, same order as t\n",
    "for i,df in enumerate([active,usage]): # estimated runtime: 200 s + 750 s\n",
    "    t1 = time.time()\n",
    "    # Join each row's tokens into one space-separated document once and\n",
    "    # reuse it for both vectorizers instead of mapping the column twice.\n",
    "    docs = df['appid'].map(lambda x:' '.join(x)).values\n",
    "    tfidf = TfidfVectorizer(analyzer='word',token_pattern=u\"(?u)\\\\b\\\\w+\\\\b\",min_df=1,ngram_range=(1,1))\n",
    "    t.append(tfidf.fit_transform(docs))\n",
    "    cv = CountVectorizer(analyzer='word',token_pattern=u\"(?u)\\\\b\\\\w+\\\\b\",min_df=1,ngram_range=(1,1))\n",
    "    c.append(cv.fit_transform(docs))\n",
    "\n",
    "    print(time.time()-t1)  # elapsed seconds for this dataset\n",
    "\n",
    "from scipy import sparse\n",
    "# makedirs also creates vector_path when it is missing and does nothing\n",
    "# if the folder already exists, unlike the exists-check plus os.mkdir.\n",
    "sparse_dir = \"{}/Sparse_Matrix\".format(vector_path)\n",
    "os.makedirs(sparse_dir, exist_ok=True)\n",
    "sparse.save_npz('{}/active_tfidf.npz'.format(sparse_dir), t[0])\n",
    "sparse.save_npz('{}/usage_tfidf.npz'.format(sparse_dir), t[1])\n",
    "\n",
    "sparse.save_npz('{}/active_count.npz'.format(sparse_dir), c[0])\n",
    "sparse.save_npz('{}/usage_count.npz'.format(sparse_dir), c[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
